knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(oharac)
library(tidyverse)
library(here)
source(here('common_fxns.R'))
library(mice)Read in processed grouping traits, then perform MICE separately on various taxonomic groupings. While we have data for generation time and fecundity for many species, they will not be used to assign functional entities, as they are more related to intraspecific interactions than interspecific interactions. They can, however, be used to help impute missing values from other traits.
See individual trait scripts.
Read in the processed grouping traits data; bind WoRMS classifications; then filter to just the included taxonomic groups.
wcol_levels <- c('rf', 'pel', 'ben', 'bp') ### not ordered
mob_levels <- c('ses', 'sed', 'mob', 'mig') ### in order from least to most mobile
worms_spp <- assemble_worms()
### file created in script 1b:
keep_df <- read_csv(here('int/mice_grouping_taxa_to_keep.csv'))
vert_keep_df <- keep_df %>%
filter(phylum == 'chordata') %>%
select(class, gp) %>%
distinct()
vert_keep <- vert_keep_df$gp %>% setNames(vert_keep_df$class)
traits_premice_all <- read_csv(here_anx('mice/gp_traits_mice_preimputation.csv')) %>%
mutate(wcol = factor(wcol, levels = wcol_levels, ordered = FALSE),
adult_mob = factor(adult_mob, levels = mob_levels, ordered = TRUE)) %>%
inner_join(worms_spp, by = 'species') %>%
mutate(gp = case_when(phylum == 'chordata' ~ vert_keep[class],
phylum == 'mollusca' & order %in% keep_df$order ~ class,
!phylum %in% c('chordata', 'mollusca') & phylum %in% keep_df$phylum ~ phylum)) %>%
filter(!is.na(gp))
### 124,462 across all spp; 119,219 for kept taxaFor all groups, include all traits, as well as order and family as additional predictors. With family, imputation is much slower, but with order only, RMSE differences seem high (quick analysis looking at some invertebrates):
| gp | log_l_mean | log_f_mean | troph_mean | age_mat_mean |
|---|---|---|---|---|
| annelida | 0.720 | 0.772 | 0.244 | 0.108 |
| arthropoda | 1.17 | 2.87 | 0.279 | 0.914 |
| cnidaria | 0.572 | NaN | 0.256 | 1.58 |
| echinodermata | 0.852 | 0.788 | 0.482 | 0.149 |
keep_ranks <- c('order', 'family')
traits_prepped <- traits_premice_all %>%
select(log_l, troph,
log_f, age_mat,
wcol, adult_mob,
all_of(keep_ranks), species, gp, phylum) %>%
mutate(order = factor(order, ordered = FALSE),
family = factor(family, ordered = FALSE),
species = factor(species, ordered = FALSE)) %>%
distinct()
invert_traits <- traits_prepped %>%
filter(!phylum %in% c('chordata', 'mollusca')) %>%
select(-phylum)Create a blank run (no iterations) to generate a prediction matrix and a methods vector, then update these as needed.
blank_run <- mice(invert_traits %>% select(-gp), maxit = 0)
### set up predictor matrix
pred_mtx <- blank_run$pred
pred_mtx[ , 'species'] <- 0
### species column is all zeros - ignored for imputation
pred_mtx[c('species'), ] <- 0
pred_mtx[keep_ranks, ] <- 0
### don't impute values for these... (should not be an issue since no NAs)Row names are variables to be imputed; column names are variables used to impute. An all-zero column indicates: do not use this variable to impute; an all-zero row indicates: do not impute this variable.
pred_mtx## log_l troph log_f age_mat wcol adult_mob order family species
## log_l 0 1 1 1 1 1 1 1 0
## troph 1 0 1 1 1 1 1 1 0
## log_f 1 1 0 1 1 1 1 1 0
## age_mat 1 1 1 0 1 1 1 1 0
## wcol 1 1 1 1 0 1 1 1 0
## adult_mob 1 1 1 1 1 0 1 1 0
## order 0 0 0 0 0 0 0 0 0
## family 0 0 0 0 0 0 0 0 0
## species 0 0 0 0 0 0 0 0 0
Values of "" indicate no method to impute (complete variable, no imputation needed). From ?mice:
By default, the method uses
pmm, predictive mean matching (numeric data);logreg, logistic regression imputation (binary data, factor with 2 levels);polyreg, polytomous regression imputation for unordered categorical data (factor > 2 levels);polr, proportional odds model for (ordered, > 2 levels).
method_vec <- blank_run$meth
method_vec## log_l troph log_f age_mat wcol adult_mob order family
## "pmm" "pmm" "pmm" "pmm" "" "polr" "" ""
## species
## ""
Note that mice fails on molluscs at the phylum level - presumably too many orders/families. Identify classes of molluscs in AquaMaps, limit to just those (including family Conidae, from IUCN) and run those separately.
From Stack Overflow: > This functions adds a star to variable names in the mice iteration history to signal that a ridge penalty was added. In that case, it also adds an entry to loggedEvents.
This happens when it was unable to invert the QR or SVD decomposition of the covariate matrix.
So, I think what’s happening is that when you include this nominal variable as a predictor for some of those pmm variables the resulting predictor matrix for that variable is either collinear or nearly collinear. mice will drop any variables that are exactly collinear, but if they aren’t quite collinear it might still lead to a matrix that is nearly linearly dependent and can’t be inverted. In that case mice adds a small term to the diagonal elements which allows an inverse to be taken and the computation to continue.
for(p in unique(invert_traits$gp)) {
### p <- unique(invert_traits$gp)[1]
message('Processing phylum ', p, '...')
### set up file for imputed traits (all small enough for Git?)
post_impute_f <- sprintf(here_anx('mice/mice_out/gp_traits_%s_mice.csv'), p)
### .Rdata path for saving the imputation `mids` object
impute_object_f <- sprintf(here_anx('mice/rdata/gp_traits_%s_mice_results.RData'), p)
if(!file.exists(post_impute_f)) {
### unlink(post_impute_f)
### filter to just this phylum and drop the group column
p_traits <- invert_traits %>%
filter(gp == p) %>%
select(-gp)
### impute!
ptm <- proc.time()
imp <- mice(p_traits,
m = 5, maxit = 10,
method = method_vec,
predictorMatrix = pred_mtx,
seed = 42)
message('Phylum ', p, ' took ', (proc.time() - ptm)[3], ' seconds to process')
### complete the trait set and write out the traits post-imputation. For
### file size, convert wcol and adult_mob to integers instead of character
traits_complete <- complete(imp, 'long') %>%
select(-all_of(keep_ranks)) %>% ### can reattach higher ranks easily
mutate(log_l = round(log_l, 5),
log_f = round(log_f, 5),
age_mat = round(age_mat, 5))
### from above:
### wcol_levels <- c('rf', 'pel', 'ben', 'bp')
### mob_levels <- c('ses', 'sed', 'mob', 'mig')
write_csv(traits_complete, post_impute_f)
### write out results to .RData?
save(imp, file = impute_object_f)
} ### end analysis for imputation
### plot results for inspection
load(impute_object_f) ### stored as imp
cat('\n\n####', p, '\n\n') ### put in a header line
# print(summary(imp))
print(plot(imp)) ### do troph, log_f, and age_mat have convergence issues?
# print(densityplot(imp)) ### doesn't show categoricals?
}Earlier, iterating over molluscs was problematic, so splitting into classes here. Will also try running all molluscs, just to see if filtering down the mollusc orders helped avoid the problem from before.
mollusc_traits <- traits_prepped %>%
filter(phylum == 'mollusca') %>%
select(-phylum)
for(cls in unique(mollusc_traits$gp)) {
### cls <- unique(mollusc_traits$gp)[1]
message('Processing class ', cls, '...')
### set up file for imputed traits (all small enough for Git?)
post_impute_f <- sprintf(here_anx('mice/mice_out/gp_traits_mollusca_%s_mice.csv'), cls)
### .Rdata path for saving the imputation `mids` object
impute_object_f <- sprintf(here_anx('mice/rdata/gp_traits_mollusca_%s_mice_results.RData'), cls)
if(!file.exists(post_impute_f)) {
### unlink(post_impute_f)
### filter to just this phylum and drop the group column
cls_traits <- mollusc_traits %>%
filter(gp == cls) %>%
select(-gp)
### impute!
ptm <- proc.time()
imp <- mice(cls_traits,
m = 5, maxit = 10,
method = method_vec,
predictorMatrix = pred_mtx,
seed = 42)
message('Mollusca class ', cls, ' took ', (proc.time() - ptm)[3], ' seconds to process')
### complete the trait set and write out the traits post-imputation. For
### file size, convert wcol and adult_mob to integers instead of character
traits_complete <- complete(imp, 'long') %>%
select(-all_of(keep_ranks)) %>% ### can reattach higher ranks easily
mutate(log_l = round(log_l, 5),
log_f = round(log_f, 5),
age_mat = round(age_mat, 5))
### from above:
### wcol_levels <- c('rf', 'pel', 'ben', 'bp')
### mob_levels <- c('ses', 'sed', 'mob', 'mig')
write_csv(traits_complete, post_impute_f)
### write out results to .RData?
save(imp, file = impute_object_f)
} ### end analysis for imputation
### plot results for inspection
load(impute_object_f) ### stored as imp
cat('\n\n#### Mollusca: ', cls, '\n\n') ### put in a header line
# print(summary(imp))
print(plot(imp)) ### do troph, log_f, and age_mat have convergence issues?
# print(densityplot(imp)) ### doesn't show categoricals?
}message('Processing phylum mollusca as a whole... ')
### set up file for imputed traits (all small enough for Git?)
post_impute_f <- sprintf(here_anx('mice/mice_out/gp_traits_mollusca_combined_mice.csv'), cls)
### .Rdata path for saving the imputation `mids` object
impute_object_f <- sprintf(here_anx('mice/rdata/gp_traits_mollusca_combined_mice_results.RData'), cls)
if(!file.exists(post_impute_f)) {
### unlink(post_impute_f)
### filter to just this phylum and drop the group column
p_traits <- mollusc_traits %>%
select(-gp)
### impute!
ptm <- proc.time()
imp <- mice(p_traits,
m = 5, maxit = 10,
method = method_vec,
predictorMatrix = pred_mtx,
seed = 42)
message('Mollusca combined took ', (proc.time() - ptm)[3], ' seconds to process')
### complete the trait set and write out the traits post-imputation. For
### file size, convert wcol and adult_mob to integers instead of character
traits_complete <- complete(imp, 'long') %>%
select(-all_of(keep_ranks)) %>% ### can reattach higher ranks easily
mutate(log_l = round(log_l, 5),
log_f = round(log_f, 5),
age_mat = round(age_mat, 5))
### from above:
### wcol_levels <- c('rf', 'pel', 'ben', 'bp')
### mob_levels <- c('ses', 'sed', 'mob', 'mig')
write_csv(traits_complete, post_impute_f)
### write out results to .RData?
save(imp, file = impute_object_f)
} ### end analysis for imputation
### plot results for inspection
load(impute_object_f) ### stored as imp
cat('\n\n#### Mollusca combined\n\n') ### put in a header line# print(summary(imp))
print(plot(imp)) ### do troph, log_f, and age_mat have convergence issues?# print(densityplot(imp)) ### doesn't show categoricals?chordata_traits <- traits_prepped %>%
filter(phylum == 'chordata') %>%
select(-phylum)
chordates <- unique(chordata_traits$gp)
for(cls in chordates) {
### cls <- chordates[1]
message('Processing chordata class ', cls, '...')
### set up file for imputed traits (all small enough for Git?)
post_impute_f <- sprintf(here_anx('mice/mice_out/gp_traits_chordata_%s_mice.csv'), cls)
### .Rdata path for saving the imputation `mids` object
impute_object_f <- sprintf(here_anx('mice/rdata/gp_traits_chordata_%s_mice_results.RData'), cls)
if(!file.exists(post_impute_f)) {
### unlink(post_impute_f)
### filter to just this phylum and drop the group column
cls_traits <- chordata_traits %>%
filter(gp == cls) %>%
select(-gp) %>%
distinct()
if(cls == 'aves') {
### this one row is causing an error for some reason...
### Error in apply(draws, 2, sum) : dim(X) must have a positive length
### it's the only observation of a Brown Noddy, which is not in AquaMaps
cls_traits <- cls_traits %>%
filter(species != 'anous stolidus')
}
### impute!
ptm <- proc.time()
imp <- mice(cls_traits,
m = 5, maxit = 10,
method = method_vec,
predictorMatrix = pred_mtx,
seed = 42)
message('Chordata class ', cls, ' took ', (proc.time() - ptm)[3], ' seconds to process')
### complete the trait set and write out the traits post-imputation. For
### file size, convert wcol and adult_mob to integers instead of character
traits_complete <- complete(imp, 'long') %>%
select(-all_of(keep_ranks)) %>% ### can reattach higher ranks easily
mutate(log_l = round(log_l, 5),
log_f = round(log_f, 5),
age_mat = round(age_mat, 5))
### from above:
### wcol_levels <- c('rf', 'pel', 'ben', 'bp')
### mob_levels <- c('ses', 'sed', 'mob', 'mig')
write_csv(traits_complete, post_impute_f)
### write out results to .RData?
save(imp, file = impute_object_f)
} ### end analysis for imputation
### plot results for inspection
load(impute_object_f) ### stored as imp
cat('\n\n#### Mollusca: ', cls, '\n\n') ### put in a header line
# print(summary(imp))
print(plot(imp)) ### do troph, log_f, and age_mat have convergence issues?
# print(densityplot(imp)) ### doesn't show categoricals?
} iter imp variable 1 1 log_l* troph* log_f* age_mat* adult_mob 1 2 log_l* troph* log_f* age_mat* adult_mob 1 3 log_l* troph* log_f* age_mat* adult_mob 1 4 log_l* troph* log_f* age_mat* adult_mob 1 5 log_l* troph* log_f* age_mat* adult_mob 2 1 log_l* troph log_f* age_mat* adult_mob 2 2 log_l* troph* log_f* age_mat* adult_mob 2 3 log_l* troph* log_f* age_mat* adult_mob 2 4 log_l* troph* log_f* age_mat* adult_mob 2 5 log_l* troph* log_f* age_mat* adult_mob 3 1 log_l* troph* log_f* age_mat* adult_mob 3 2 log_l* troph* log_f* age_mat* adult_mob 3 3 log_l* troph* log_f* age_mat* adult_mob 3 4 log_l troph* log_f* age_mat* adult_mob 3 5 log_l* troph* log_f* age_mat* adult_mob 4 1 log_l* troph* log_f* age_mat adult_mob 4 2 log_l* troph* log_f* age_mat* adult_mob 4 3 log_l* troph* log_f* age_mat* adult_mob 4 4 log_l* troph* log_f* age_mat* adult_mob 4 5 log_l* troph* log_f* age_mat* adult_mob 5 1 log_l* troph* log_f* age_mat* adult_mob 5 2 log_l* troph* log_f* age_mat* adult_mob 5 3 log_l* troph* log_f* age_mat* adult_mob 5 4 log_l troph* log_f* age_mat* adult_mob 5 5 log_l* troph* log_f* age_mat* adult_mob 6 1 log_l* troph* log_f* age_mat* adult_mob 6 2 log_l* troph* log_f* age_mat* adult_mob 6 3 log_l* troph* log_f* age_mat* adult_mob 6 4 log_l* troph* log_f* age_mat* adult_mob 6 5 log_l* troph* log_f* age_mat* adult_mob 7 1 log_l* troph* log_f* age_mat* adult_mob 7 2 log_l* troph* log_f* age_mat* adult_mob 7 3 log_l* troph* log_f* age_mat* adult_mob 7 4 log_l troph* log_f* age_mat* adult_mob 7 5 log_l* troph* log_f* age_mat* adult_mob 8 1 log_l* troph* log_f* age_mat* adult_mob 8 2 log_l* troph* log_f* age_mat* adult_mob 8 3 log_l* troph* log_f* age_mat* adult_mob 8 4 log_l* troph* log_f* age_mat* adult_mob 8 5 log_l* troph* log_f* age_mat* adult_mob 9 1 log_l* troph* log_f* age_mat* adult_mob 9 2 log_l* troph* log_f* age_mat* adult_mob 9 3 log_l* troph* log_f* age_mat* adult_mob 9 4 log_l* troph* log_f* age_mat* adult_mob 9 5 log_l* troph* log_f* age_mat* adult_mob 10 1 log_l* troph* log_f* age_mat* adult_mob 10 2 log_l* troph* log_f* age_mat* adult_mob 10 3 log_l* troph* log_f* age_mat* adult_mob 10 4 log_l* troph* log_f age_mat* adult_mob 10 5 log_l* troph* log_f* age_mat* adult_mob
iter imp variable 1 1 log_l* troph* log_f* age_mat* adult_mob 1 2 log_l* troph* log_f* age_mat* adult_mob 1 3 log_l* troph* log_f* age_mat* adult_mob 1 4 log_l* troph* log_f* age_mat* adult_mob 1 5 log_l* troph* log_f* age_mat* adult_mob 2 1 log_l* troph* log_f* age_mat* adult_mob 2 2 log_l* troph* log_f* age_mat* adult_mob 2 3 log_l* troph* log_f* age_mat* adult_mob 2 4 log_l* troph* log_f* age_mat* adult_mob 2 5 log_l troph* log_f* age_mat* adult_mob 3 1 log_l* troph* log_f* age_mat* adult_mob 3 2 log_l* troph* log_f* age_mat* adult_mob 3 3 log_l troph* log_f* age_mat* adult_mob 3 4 log_l* troph* log_f* age_mat* adult_mob 3 5 log_l* troph* log_f* age_mat* adult_mob 4 1 log_l* troph* log_f* age_mat* adult_mob 4 2 log_l* troph* log_f* age_mat* adult_mob 4 3 log_l* troph* log_f* age_mat* adult_mob 4 4 log_l* troph* log_f* age_mat* adult_mob 4 5 log_l* troph* log_f* age_mat* adult_mob 5 1 log_l* troph* log_f* age_mat* adult_mob 5 2 log_l* troph log_f* age_mat* adult_mob 5 3 log_l* troph* log_f* age_mat* adult_mob 5 4 log_l* troph* log_f* age_mat* adult_mob 5 5 log_l* troph* log_f* age_mat* adult_mob 6 1 log_l* troph log_f* age_mat* adult_mob 6 2 log_l* troph* log_f* age_mat adult_mob 6 3 log_l* troph* log_f* age_mat* adult_mob 6 4 log_l* troph* log_f* age_mat* adult_mob 6 5 log_l* troph* log_f* age_mat* adult_mob 7 1 log_l* troph log_f* age_mat* adult_mob 7 2 log_l* troph* log_f* age_mat* adult_mob 7 3 log_l* troph* log_f* age_mat* adult_mob 7 4 log_l* troph* log_f* age_mat* adult_mob 7 5 log_l* troph* log_f* age_mat* adult_mob 8 1 log_l* troph* log_f* age_mat* adult_mob 8 2 log_l* troph* log_f* age_mat* adult_mob 8 3 log_l* troph log_f* age_mat* adult_mob 8 4 log_l* troph* log_f* age_mat* adult_mob 8 5 log_l* troph* log_f* age_mat* adult_mob 9 1 log_l* troph log_f* age_mat* adult_mob 9 2 log_l* troph* log_f* age_mat* adult_mob 9 3 log_l* troph* log_f* age_mat* adult_mob 9 4 log_l* troph* log_f* age_mat* adult_mob 9 5 log_l* troph* log_f* age_mat* adult_mob 10 1 log_l* troph* log_f* age_mat* adult_mob 10 2 log_l* troph* log_f* age_mat* adult_mob 10 3 log_l* troph* log_f* age_mat* adult_mob 10 4 log_l* troph* log_f* age_mat* adult_mob 10 5 log_l* troph* log_f* age_mat* adult_mob
iter imp variable 1 1 log_l* log_f* age_mat adult_mob 1 2 log_l* log_f* age_mat adult_mob 1 3 log_l* log_f* age_mat adult_mob 1 4 log_l* log_f* age_mat adult_mob 1 5 log_l* log_f* age_mat adult_mob 2 1 log_l* log_f* age_mat adult_mob 2 2 log_l* log_f* age_mat adult_mob 2 3 log_l* log_f* age_mat adult_mob 2 4 log_l* log_f* age_mat adult_mob 2 5 log_l* log_f* age_mat adult_mob 3 1 log_l* log_f* age_mat adult_mob 3 2 log_l* log_f* age_mat adult_mob 3 3 log_l* log_f* age_mat adult_mob 3 4 log_l* log_f* age_mat adult_mob 3 5 log_l* log_f* age_mat adult_mob 4 1 log_l* log_f* age_mat adult_mob 4 2 log_l* log_f* age_mat adult_mob 4 3 log_l* log_f* age_mat adult_mob 4 4 log_l* log_f* age_mat adult_mob 4 5 log_l* log_f* age_mat adult_mob 5 1 log_l* log_f* age_mat adult_mob 5 2 log_l* log_f* age_mat adult_mob 5 3 log_l* log_f* age_mat adult_mob 5 4 log_l* log_f* age_mat adult_mob 5 5 log_l* log_f* age_mat adult_mob 6 1 log_l* log_f* age_mat adult_mob 6 2 log_l* log_f* age_mat adult_mob 6 3 log_l* log_f* age_mat adult_mob 6 4 log_l* log_f* age_mat adult_mob 6 5 log_l* log_f* age_mat adult_mob 7 1 log_l* log_f* age_mat adult_mob 7 2 log_l* log_f* age_mat adult_mob 7 3 log_l* log_f* age_mat adult_mob 7 4 log_l* log_f* age_mat adult_mob 7 5 log_l* log_f* age_mat adult_mob 8 1 log_l* log_f* age_mat adult_mob 8 2 log_l* log_f* age_mat adult_mob 8 3 log_l* log_f* age_mat adult_mob 8 4 log_l* log_f* age_mat adult_mob 8 5 log_l* log_f* age_mat adult_mob 9 1 log_l* log_f* age_mat adult_mob 9 2 log_l* log_f* age_mat adult_mob 9 3 log_l* log_f* age_mat adult_mob 9 4 log_l* log_f* age_mat adult_mob 9 5 log_l* log_f* age_mat adult_mob 10 1 log_l* log_f* age_mat adult_mob 10 2 log_l* log_f* age_mat adult_mob 10 3 log_l* log_f* age_mat adult_mob 10 4 log_l* log_f* age_mat adult_mob 10 5 log_l* log_f* age_mat adult_mob
phyla_to_rerun <- c('cnidaria', 'mollusca')
### Drop family from prediction matrix
pred_mtx2 <- pred_mtx
pred_mtx2[ ,'family'] <- 0
for(p in phyla_to_rerun) {
### p <- phyla_to_rerun[2]
### set up file for imputed traits (all small enough for Git?)
post_impute_f <- sprintf(here_anx('mice/mice_out/order_only/gp_traits_%s_no_fam_mice.csv'), p)
### .Rdata path for saving the imputation `mids` object
impute_object_f <- sprintf(here_anx('mice/rdata/gp_traits_%s_no_fam_mice_results.RData'), p)
if(!file.exists(post_impute_f)) {
### unlink(post_impute_f)
### filter to just this phylum and drop the group column
p_traits <- traits_prepped %>%
filter(phylum == p) %>%
select(-gp, -phylum)
### impute!
ptm <- proc.time()
imp <- mice(p_traits,
m = 5, maxit = 10,
method = method_vec,
predictorMatrix = pred_mtx2,
seed = 42)
message(p, ' took ', (proc.time() - ptm)[3], ' seconds to process')
### complete the trait set and write out the traits post-imputation. For
### file size, convert wcol and adult_mob to integers instead of character
traits_complete <- complete(imp, 'long') %>%
select(-all_of(keep_ranks)) %>% ### can reattach higher ranks easily
mutate(log_l = round(log_l, 5),
log_f = round(log_f, 5),
age_mat = round(age_mat, 5))
### from above:
### wcol_levels <- c('rf', 'pel', 'ben', 'bp')
### mob_levels <- c('ses', 'sed', 'mob', 'mig')
write_csv(traits_complete, post_impute_f)
### write out results to .RData?
save(imp, file = impute_object_f)
} ### end analysis for imputation
### plot results for inspection
load(impute_object_f) ### stored as imp
cat('\n\n####', p, '\n\n') ### put in a header line
print(plot(imp)) ### do troph, log_f, and age_mat have convergence issues?
}Read in all the MICE-gapfilled results, check for any remaining gaps (i.e., where data for one trait was simply not present in an entire class or phylum under analysis). Since we are mostly concerned about species with maps, focus on those species listed in AquaMaps (i.e., gaps in non-mapped species and taxa are not important).
am_spp <- get_am_spp_info()
fs <- list.files(here_anx('mice/mice_out'), pattern = 'gp_traits', full.names = TRUE)
traits <- c('log_l', 'log_f', 'troph', 'age_mat', 'wcol', 'adult_mob')
post_mice_df <- lapply(fs, read_csv, show_col_types = FALSE) %>%
setNames(basename(fs)) %>%
bind_rows(.id = 'file') %>%
mutate(gp = str_remove_all(file, 'gp_traits_|_mice.csv'))
post_mice_spp_counts <- post_mice_df %>%
filter(species %in% am_spp$sciname) %>%
select(gp, species) %>%
distinct() %>%
group_by(gp) %>%
summarize(n_spp = n_distinct(species))
na_present <- post_mice_df %>%
filter(species %in% am_spp$sciname) %>%
filter(rowSums(is.na(.)) > 0) %>%
gather(var, val, all_of(traits)) %>%
filter(is.na(val)) %>%
select(-.imp) %>%
distinct() %>%
group_by(species, gp, var) %>%
summarize(n_instances = n()) %>%
distinct() %>%
group_by(species, gp) %>%
summarize(vars = paste(var, collapse = ';'))
na_summary <- na_present %>%
group_by(gp, vars) %>%
summarize(n_spp_na = n_distinct(species)) %>%
summarize(vars = paste(sprintf('{%s} (%s)', vars, n_spp_na), collapse = ', '),
n_spp_na = sum(n_spp_na)) %>%
full_join(post_mice_spp_counts, by = 'gp') %>%
arrange(desc(n_spp_na))
knitr::kable(na_summary)| gp | vars | n_spp_na | n_spp |
|---|---|---|---|
| mollusca_gastropoda | {adult_mob} (1), {adult_mob;log_f} (2628), {log_f} (4) | 2633 | 2635 |
| porifera | {adult_mob;log_f;troph} (4), {troph} (821) | 825 | 825 |
| cnidaria | {adult_mob;log_f} (319) | 319 | 477 |
| mollusca_bivalvia | {adult_mob} (130) | 130 | 132 |
| mollusca_polyplacophora | {adult_mob;age_mat;log_f} (88) | 88 | 88 |
| mollusca_scaphopoda | {adult_mob;age_mat;log_f} (87) | 87 | 87 |
| mollusca_cephalopoda | {adult_mob} (10) | 10 | 252 |
| chordata_myxini | {age_mat} (3) | 3 | 55 |
| chordata_actinopterygii | {adult_mob;age_mat;log_f;wcol} (1), {wcol} (1) | 2 | 11817 |
| annelida | NA | NA | 377 |
| arthropoda | NA | NA | 2774 |
| chordata_aves | NA | NA | 13 |
| chordata_elasmobranchii | NA | NA | 913 |
| chordata_mammalia | NA | NA | 120 |
| chordata_reptilia | NA | NA | 34 |
| echinodermata | NA | NA | 1207 |
| mollusca_combined | NA | NA | 3194 |
From this analysis, some clear problems to be addressed:
For adult mobility, identify Bivalvia orders that tend to be sessile vs sedentary (are any mobile residents? scallops?)
Orders:
mollusc_class_df <- post_mice_df %>%
filter(str_detect(gp, 'mollusc') & !str_detect(gp, 'combined')) %>%
arrange(species, .id, .imp) %>%
mutate(id = 1:n()) %>%
gather(trait, val, all_of(traits)) %>%
select(-file, -gp, -.id, -.imp)
mollusc_phylum_df <- post_mice_df %>%
filter(str_detect(gp, 'mollusc') & str_detect(gp, 'combined')) %>%
arrange(species, .id, .imp) %>%
mutate(id = 1:n()) %>%
gather(trait, val_p, all_of(traits)) %>%
select(-file, -gp, -.id, -.imp)
sed <- worms_spp %>%
filter(class %in% c('polyplacophora', 'scaphopoda', 'bivalvia'))
ses <- worms_spp %>%
filter(order %in% c('mytilida', 'adapedonta', 'mytiloida', 'ostreida'))
mollusc_new_df <- full_join(mollusc_class_df, mollusc_phylum_df) %>%
mutate(val = ifelse(is.na(val), val_p, val)) %>%
select(-val_p) %>%
spread(trait, val) %>%
mutate(across(c(age_mat, log_f, log_l, troph), .fns = as.numeric),
adult_mob = case_when(species %in% sed$species ~ 'sed',
species %in% ses$species ~ 'ses',
TRUE ~ adult_mob),
gp = 'mollusca')Of five classes of phylum Cnidaria, only Anthozoa has any data on fecundity and adult mobility. Cnidaria are broadcast spawners, so fill fecundity with a high value (mean across anthozoa seems reasonable). Adult mobility is more complicated - estimate by class, and for hydrozoa, by order.
# cnidaria_fam_df <- post_mice_df %>%
# filter(gp == 'cnidaria') %>%
# arrange(species, .id, .imp) %>%
# mutate(id = 1:n()) %>%
# gather(trait, val, all_of(traits)) %>%
# select(-file, -gp, -.id, -.imp)
### order-only MICE doesn't result in any fewer NAs
# cnidaria_order_df <- read_csv(here_anx('mice/mice_out/order_only/gp_traits_cnidaria_no_fam_mice.csv')) %>%
# arrange(species, .id, .imp) %>%
# mutate(id = 1:n()) %>%
# gather(trait, val_no_fam, all_of(traits)) %>%
# select(-.id, -.imp)
ses <- worms_spp %>%
filter(class %in% c('anthozoa', 'staurozoa'))
sed <- worms_spp %>%
filter(order %in% c('anthoathecata', 'leptothecata', 'actinulida'))
mob <- worms_spp %>%
filter(order %in% c('limnomedusae', 'trachymedusae', 'narcomedusae', 'siphonophorae') |
class %in% c('cubozoa', 'scyphozoa'))
cnidaria_new_df <- post_mice_df %>%
filter(gp == 'cnidaria') %>%
mutate(adult_mob = case_when(species %in% ses$species ~ 'ses',
species %in% sed$species ~ 'sed',
species %in% mob$species ~ 'mob',
TRUE ~ adult_mob),
log_f = ifelse(is.na(log_f), mean(log_f, na.rm = TRUE), log_f))post_mice_fixed_df <- post_mice_df %>%
filter(!str_detect(gp, 'mollusca|cnidaria')) %>%
bind_rows(mollusc_new_df, cnidaria_new_df) %>%
mutate(troph = ifelse(gp == 'porifera', 2, troph))
post_mice_spp_counts <- post_mice_fixed_df %>%
filter(species %in% am_spp$sciname) %>%
select(gp, species) %>%
distinct() %>%
group_by(gp) %>%
summarize(n_spp = n_distinct(species))
na_present <- post_mice_fixed_df %>%
filter(species %in% am_spp$sciname) %>%
filter(rowSums(is.na(.)) > 0) %>%
gather(var, val, all_of(traits)) %>%
filter(is.na(val)) %>%
select(-.imp) %>%
distinct() %>%
group_by(species, gp, var) %>%
summarize(n_instances = n()) %>%
distinct() %>%
group_by(species, gp) %>%
summarize(vars = paste(var, collapse = ';'))
na_summary <- na_present %>%
group_by(gp, vars) %>%
summarize(n_spp_na = n_distinct(species)) %>%
summarize(vars = paste(sprintf('{%s} (%s)', vars, n_spp_na), collapse = ', '),
n_spp_na = sum(n_spp_na)) %>%
full_join(post_mice_spp_counts, by = 'gp') %>%
arrange(desc(n_spp_na))
knitr::kable(na_summary)| gp | vars | n_spp_na | n_spp |
|---|---|---|---|
| porifera | {adult_mob;log_f} (4) | 4 | 825 |
| chordata_myxini | {age_mat} (3) | 3 | 55 |
| chordata_actinopterygii | {adult_mob;age_mat;log_f;wcol} (1), {wcol} (1) | 2 | 11817 |
| annelida | NA | NA | 377 |
| arthropoda | NA | NA | 2774 |
| chordata_aves | NA | NA | 13 |
| chordata_elasmobranchii | NA | NA | 913 |
| chordata_mammalia | NA | NA | 120 |
| chordata_reptilia | NA | NA | 34 |
| cnidaria | NA | NA | 477 |
| echinodermata | NA | NA | 1207 |
| mollusca | NA | NA | 3194 |
Take mean for each species across all imputation models. For unordered categorical variables, examine the mode. For ordered categoricals, find the mean of the observed factor levels, then round to nearest level. Note in many cases, multiple observations of a variable for a particular species resulted in multiple imputations (e.g., Gadus morhua) - these are averaged across all observations.
As an ordered categorical (sessile < sedentary < mobile resident < migratory), compare mode value to value calculated as rounded mean of imputed levels.
mob_levels = c('ses', 'sed', 'mob', 'mig')
trait_mob_ensemble <- post_mice_fixed_df %>%
mutate(adult_mob = factor(adult_mob, levels = mob_levels)) %>%
group_by(species, adult_mob) %>%
mutate(n_mob = n()) %>%
group_by(species) %>%
summarize(n_vals = n(),
mob_mean = mean(as.numeric(adult_mob)),
mob_sd = sd(as.numeric(adult_mob)),
mob_mode = first(adult_mob[n_mob == max(n_mob)]),
pct_mob = max(n_mob) / n_vals) %>%
mutate(mob_mean = mob_levels[round(mob_mean)],
match = mob_mean == mob_mode)
trait_mob_ensemble %>%
select(mob_mean, mob_mode) %>%
table()## mob_mode
## mob_mean ses sed mob mig
## mig 0 0 2077 2990
## mob 0 564 23445 741
## sed 1533 5160 3444 0
## ses 11401 0 0 0
Water column position can be considered an unordered variable: while benthic < benthopelagic < pelagic, reef does not neatly fall into that order. Select mode from imputed values.
trait_wcol_ensemble <- post_mice_fixed_df %>%
group_by(species, wcol) %>%
mutate(n_wcol = n()) %>%
group_by(species) %>%
summarize(n_vals = n(),
wcol_mode = paste(unique(wcol[n_wcol == max(n_wcol)]), collapse = ';'),
pct_wcol = max(n_wcol) / n_vals)
trait_wcol_ensemble$wcol_mode %>% table()## .
## ben bp NA pel rf
## 28423 3875 39 4243 14791
Trophic level, log(length), log(fecundity), and age to maturity - find mean and sd.
traits_num_ensemble <- post_mice_fixed_df %>%
group_by(species) %>%
summarize(across(c(log_l, log_f, age_mat, troph), .fns = list(mean = mean, sd = sd)),
n_vals = n())
plot_df <- traits_num_ensemble %>%
gather(trait, val, -species, -n_vals)
ggplot(plot_df, aes(x = val)) +
theme_minimal() +
geom_histogram(bins = 15) +
facet_wrap(~ trait, ncol = 2, scales = 'free')Select mean mobility, mode water column position, mean log(length), mean log(fecundity), mean trophic level, mean age to maturity.
traits_ensemble <- trait_mob_ensemble %>%
select(species, adult_mob = mob_mean) %>%
full_join(trait_wcol_ensemble %>%
select(species, wcol = wcol_mode), by = 'species') %>%
full_join(traits_num_ensemble %>%
select(species, log_l = log_l_mean, log_f = log_f_mean,
age_mat = age_mat_mean, troph = troph_mean),
by = 'species')
write_csv(traits_ensemble, here('_data/grouping_traits_post_imputation.csv'))